
# Data manipulation
# - Creating variables
# - Descriptive statistics
# - Checking the constraints (Fuel/Capacity)


## ----library -----

pacman::p_load(tidyverse,
               lubridate, 
               ggthemes,
               MASS, 
               stargazer,    # generate regression table
               foreign,      # for Stata data
               glmnet,       # Lasso estimation
               psych,        # for summary statistics
               knitr,        # kable
               kableExtra,   # fancy table
               Benchmarking, # DEA analysis
               frontier,     # SFA analysis
               estimatr,     # Robust Estimator
               patchwork,    # easy layout of ggplots
               extrafont,    # ggplot fonts
               huxtable,     # use huxtable
               recipes)      # to use discretize 


## ----data, include=FALSE,eval = TRUE-----------------------------------
# Loading data and manupilation
# dat3 is generated. 


# generated in "daily_decision_maeshori_ver4_27042020.R"
dat1 = read_csv("../../Data/ks_offshore_daily_clean3_ver4_27042020.csv",
                locale = locale(encoding = "UTF-8"))

# ===== Data restriction 1: -=========
# limit to Swordfish season only
# limit to before earthquake
dat2 = dat1 %>% 
  dplyr::filter(leave_mon >= 10 | leave_mon <= 3) %>%
  dplyr::filter(leave_year<=2011) %>%
  dplyr::filter(return_date < as.Date("2011-03-10"))


# ===== Name vectors for past days-catch interaction ====
names_days = paste("days_past",1:35,"ago",sep = "_")

names_SF_num = paste("SF_num",1:35,"ago",sep = "_")
names_BS_num = paste("BS_num",1:35,"ago",sep = "_")
names_SF_wt = paste("SF_wt",1:35,"ago",sep = "_")
names_BS_wt = paste("BS_wt",1:35,"ago",sep = "_")

names_days_SF_num = paste("days_SF_num",1:35,"ago",sep = "_")
names_days_BS_num = paste("days_BS_num",1:35,"ago",sep = "_")
names_days_SF_wt = paste("days_SF_wt",1:35,"ago",sep = "_")
names_days_BS_wt = paste("days_BS_wt",1:35,"ago",sep = "_")

names_days2_SF_num = paste("days2_SF_num",1:35,"ago",sep = "_")
names_days2_BS_num = paste("days2_BS_num",1:35,"ago",sep = "_")
names_days2_SF_wt = paste("days2_SF_wt",1:35,"ago",sep = "_")
names_days2_BS_wt = paste("days2_BS_wt",1:35,"ago",sep = "_")

# ==== Data restriction2: Only the trips with 15 or more operation days ======
dat2.5 = dat2 %>%
  group_by(trip_id) %>%
  mutate(trip_ope_num = max(ope_num)) %>%
  ungroup() %>%
  filter(trip_ope_num >= 15)

# ===== aggregate the data at trip level ==============

# Extract trip ID and trip-specific data
dat3.tripid = dat2.5 %>%
  distinct(trip_id,vname,call,trip_days,ope_length,
           leave_date,return_date,oil_price)

# Aggregate at trip level
dat3.trip = dat2.5 %>%
  group_by(trip_id) %>%
  summarise(exp_SF_price = mean(exp_SF_price,na.rm = T),
            exp_BS_price = mean(exp_BS_price,na.rm = T),
            SF_tot_wt = sum(sword_wt,na.rm = T),
            SF_tot_wt_interpo = sum(sword_wt_interpo,na.rm = T),
            SF_tot_num = sum(sword_num,na.rm = T),
            SF_unit_wt = ifelse(SF_tot_num == 0,NA,
                                SF_tot_wt/SF_tot_num),
            SF_unit_wt_interpo = ifelse(SF_tot_num == 0,NA,
                                        SF_tot_wt_interpo/SF_tot_num),
            BS_tot_wt = sum(bshark_wt,na.rm = T),
            BS_tot_wt_interpo = sum(bshark_wt_interpo,na.rm = T),
            BS_tot_num = sum(bshark_num,na.rm = T),
            BS_unit_wt = ifelse(BS_tot_num == 0,NA,
                                BS_tot_wt/BS_tot_num),
            BS_unit_wt_interpo = ifelse(BS_tot_num == 0,NA,
                                        BS_tot_wt_interpo/BS_tot_num),
            SF_fresh_wt_sum = sum(sword_wt*1/(ope_length - ope_num + 1)),
            BS_fresh_wt_sum = sum(bshark_wt*1/(ope_length - ope_num + 1)),
            SF_fresh_num_sum = sum(sword_num*1/(ope_length - ope_num + 1)),
            BS_fresh_num_sum = sum(bshark_num*1/(ope_length - ope_num + 1))
  ) %>%
  left_join(dat3.tripid, by = "trip_id") %>%
  mutate(leave_mon = as.numeric(lubridate::month(leave_date)),
         leave_year = as.numeric(lubridate::year(leave_date))) 


# Merge interporated data
dat3.trip2 = dat3.trip %>%
  # make log variables
  mutate(log_exp_SF_price = ifelse(exp_SF_price == 0, log(10^-323),
                                   log(exp_SF_price)),
         log_exp_BS_price = ifelse(exp_BS_price == 0, log(10^-323),
                                   log(exp_BS_price)),
         SF_cpue_wt = SF_tot_wt/ope_length,
         BS_cpue_wt = BS_tot_wt/ope_length,
         log_SF_cpue_wt = ifelse(SF_cpue_wt == 0, log(10^-323),
                                 log(SF_cpue_wt)),
         log_BS_cpue_wt = ifelse(BS_cpue_wt == 0, log(10^-323),
                                 log(BS_cpue_wt)),
         log_SF_unit_wt = ifelse(SF_unit_wt == 0, log(10^-323),
                                 log(SF_unit_wt)),
         log_SF_unit_wt_interpo = ifelse(SF_unit_wt_interpo == 0, log(10^-323),
                                         log(SF_unit_wt_interpo)),
         log_BS_unit_wt = ifelse(BS_unit_wt == 0, log(10^-323),
                                 log(BS_unit_wt)),
         log_BS_unit_wt_interpo = ifelse(BS_unit_wt_interpo == 0, log(10^-323),
                                         log(BS_unit_wt_interpo)),
         SF_cpue_num = SF_tot_num/ope_length,
         BS_cpue_num = BS_tot_num/ope_length,
         SF_rev_pue = exp_SF_price*SF_tot_wt/ope_length,
         BS_rev_pue = exp_BS_price*BS_tot_wt/ope_length,
         log_SF_rev_pue = ifelse(SF_rev_pue == 0, log(10^-323),
                                 log(SF_rev_pue)),
         log_BS_rev_pue = ifelse(BS_rev_pue == 0, log(10^-323),
                                 log(BS_rev_pue)),
         SF_fresh_wt_ope = SF_fresh_wt_sum/ope_length,
         log_SF_fresh_wt_ope = ifelse(SF_fresh_wt_ope == 0, log(10^-323),
                                      log(SF_fresh_wt_ope)),
         SF_fresh_wt_trip = SF_fresh_wt_sum/trip_days,
         SF_fresh_num_ope = SF_fresh_num_sum/ope_length,
         log_SF_fresh_num_ope = ifelse(SF_fresh_num_ope == 0, log(10^-323),
                                       log(SF_fresh_num_ope)),
         SF_fresh_num_trip = SF_fresh_num_sum/trip_days) %>%
  mutate_at(vars(starts_with("log")),funs(ifelse(. <= -Inf, -743.7469, .))) 

# Reference dependent check
dat3.trip.ref = dat2.5 %>%
  group_by(trip_id,vname,leave_year,leave_mon) %>%
  summarise(rev = max(cum_rev, na.rm = T),
            profit = max(cum_profit,na.rm = T)) %>%
  ungroup() %>%
  group_by(vname) %>%
  arrange(vname,leave_year,leave_mon) %>%
  mutate(rev_prev_own = if_else(!is.na(lag(rev,1)),lag(rev,1),rev),
         profit_prev_own = if_else(!is.na(lag(profit,1)),lag(profit,1),profit)) %>%
  ungroup() %>%
  group_by(leave_year,leave_mon) %>%
  mutate(rev_year_mon = mean(rev,ma.rm = T),
         profit_year_mon = mean(profit,na.rm = T)) %>%
  dplyr::select(-vname,-rev,-profit)

# merge reference-level data (not used for the main analysis)
dat3 = dat2.5 %>%
  left_join(dat3.trip.ref, by = c("trip_id","leave_year","leave_mon")) %>%
  mutate(dum_profit0 = if_else(cum_profit < 0, 0, 1),
         dum_profit_own = if_else(cum_profit < profit_prev_own, 0, 1),# relative to previous own profit
         dum_profit_ssn = if_else(cum_profit < profit_year_mon, 0, 1), # seasonal average profit
         dum_profit_pracebo1 = if_else(cum_profit < -1000, 0, 1),
         dum_profit_pracebo2 = if_else(cum_profit < 5000, 0, 1),
         dum_rev_own = if_else(cum_rev < rev_prev_own, 0, 1),# relative to previous own rev
         dum_rev_ssn = if_else(cum_rev < rev_year_mon, 0, 1), # seasonal average rev)
         dum_rev_pracebo1 = if_else(cum_rev < 12000, 0, 1), # cost per trip is about 1200 Man-JPY? http://www.80c.jp/food/20120903-85.html
         dum_rev_pracebo2 = if_else(cum_profit < 6000, 0, 1)) # random



## ----summary_daily, warning=FALSE, echo = FALSE,eval = TRUE------------
dat3.summ = dat3 %>%
  dplyr::select(##alb_num,alb_wt,
    ##bigeye_num, bigeye_wt,
    ##ytuna_num,ytuna_wt,
    sword_num,sword_wt_interpo,
    bshark_num,bshark_wt_interpo,
    sword_unit_wt,bshark_unit_wt,
    exp_SF_price,
    exp_BS_price) %>%
  dplyr::rename_(##"Albacore (n)" = "alb_num","Albacore (kg)" = "alb_wt",
    ##"Bigeye (n)" = "bigeye_num", "Bigeye (kg)" = "bigeye_wt",
    ##"Yellowfin (n)" = "ytuna_num", "Yellowfin (kg)" = "ytuna_wt",
    "Swordfish (count)" = "sword_num","Swordfish (kg)" = "sword_wt_interpo",
    "Blue shark (count)" = "bshark_num","Blueshark (kg)" = "bshark_wt_interpo",
    "Swordfish (kg/count)" = "sword_unit_wt",
    "Blue shark (kg/count)" = "bshark_unit_wt",
    "Expected swordfish price"="exp_SF_price",
    "Expected blueshark price"="exp_BS_price")

dat3.summ2 = psych::describe(dat3.summ) %>%
  dplyr::select(-vars,-range,-se,-skew,-kurtosis,-mad,-trimmed)%>%
  rename("N" = "n",
         "Mean" = "mean",
         "S.D." = "sd",
         "Median" = "median",
         "Min" = "min",
         "Max" = "max")

##kable(print(dat3.summ2, digits = 2), 
##      format = "markdown")



## ---- eval = TRUE------------------------------------------------------
kable(round(dat3.summ2, digits = 2),
      format = "latex", 
      escape = F,
      caption = "Summary statistics, daily level", 
      booktabs = T)%>%
  kable_styling(latex_options = "hold_position")


## ----summary_trip, echo = FALSE,eval = TRUE----------------------------
dat3.trip.summ = dat3.trip2 %>%
  dplyr::mutate(move_days = trip_days - ope_length) %>%
  dplyr::select(SF_tot_num,SF_tot_wt_interpo,BS_tot_num,BS_tot_wt_interpo,
                SF_unit_wt_interpo,BS_unit_wt_interpo,
                trip_days,ope_length,move_days,oil_price) %>%
  dplyr::rename_("Swordfish (count)" = "SF_tot_num",
                 "Swordfish (kg)" = "SF_tot_wt_interpo",
                 "Blue shark (count)" = "BS_tot_num",
                 "Blue shark (kg)" = "BS_tot_wt_interpo",
                 "Swordfish (kg/count)" = "SF_unit_wt_interpo",
                 "Blue shark (kg/count)" = "BS_unit_wt_interpo",
                 "Trip days" = "trip_days",
                 "Operation days" = "ope_length",
                 "Move/search days" = "move_days",
                 "Fuel price (JPY/L)" = "oil_price")
dat3.trip.summ2 = psych::describe(dat3.trip.summ) %>%
  dplyr::select(-vars,-range,-se,-skew,-kurtosis,-mad,-trimmed) %>%
  rename("N" = "n",
         "Mean" = "mean",
         "S.D." = "sd",
         "Median" = "median",
         "Min" = "min",
         "Max" = "max")




## ---- eval = TRUE------------------------------------------------------
kable(round(dat3.trip.summ2, digits = 2),
      format = "latex", 
      escape = F,
      caption = "Summary statistics, cruise level", 
      booktabs = T)%>%
  kable_styling(latex_options = "hold_position")


## ----const_setup, echo = FALSE, eval = TRUE----------------------------

# same as the calculation in the data, which is generated in "daily_decision_maeshori_ver4_27042020.R"
dat3.trip.const = dat3.trip2 %>%
  mutate(fuel_use = 1.64*ope_length + 2.80*(trip_days - ope_length),
         capa_wt = SF_tot_wt_interpo + BS_tot_wt_interpo,
         capa_num = SF_tot_num + BS_tot_num*(58.52/12.53)
  ) %>%
  mutate(
    fuel_use_rel = fuel_use/max(fuel_use),
    capa_wt_rel = capa_wt/max(capa_wt))

dat3.trip.const.2 = dat3.trip.const %>%
  dplyr::select(fuel_use,fuel_use_rel,capa_wt,capa_wt_rel,capa_num) %>%
  psych::describe() %>%
  dplyr::select(-vars,-range,-se,-skew,-kurtosis,-mad,-trimmed)

## kable(print(dat3.trip.const.2, digits = 2), format = "markdown")



## ----const_wt, echo = FALSE,eval = TRUE--------------------------------
# weight, raw number
hist_wt_const =  ggplot(dat3.trip.const, aes(x=capa_wt)) +
  geom_histogram(fill = "grey40",col = "grey20", binwidth = 3000) + 
  theme_bw() +
  labs(title = "Total weight per trip",
       x = "Total Weight per trip",
       y = "Count")

# weight, relative to the max
hist_wt_const_rel = ggplot(dat3.trip.const, aes(x=capa_wt_rel)) +
  geom_histogram(fill = "grey40",col = "grey20") + 
  theme_bw() +
  labs(title = "Relative weight",
       x = "Relative Total Weight per trip",
       y = "Count") + 
  theme(plot.title = element_text(size=16),
        text = element_text(family = "Times New Roman"))

hist_wt_const_by_ves = ggplot(dat3.trip.const, aes(x=capa_wt)) +
  geom_histogram(fill = "grey20", binwidth = 2000) +
  facet_wrap("call") + 
  labs(title = "Total Weight Per Trip by vessel",
       x = "Total Weight per trip",
       y = "Count")

# = for presentation ===
# weight, raw number
if(FALSE){
  png("/Users/keita/Dropbox/UW/Research/Daily/Rcode/Const_weight.png", width = 1200, height = 675)
  ggplot(dat3.trip.const, aes(x=capa_wt)) +
    geom_histogram(fill = "grey40",col = "grey20", binwidth = 3000) + 
    theme_stata() +
    theme(title = element_text(size = 25),
          axis.title.x = element_text(size=20),
          axis.title.y = element_text(size=20),
          axis.text.x = element_text(size=20),
          axis.text.y = element_text(size=20)) +
    labs(title = "Total weight per trip",
         x = "\n Total Weight per trip",
         y = "Count\n")
  dev.off()
}

# weight, raw number, by vesssel
if(FALSE){
  png("/Users/keita/Dropbox/UW/Research/Daily/Rcode/Const_weight_ves.png", width = 1200, height = 675)
  ggplot(dat3.trip.const, aes(x=capa_wt)) +
    geom_histogram(fill = "grey40",col = "grey20", binwidth = 3000) + 
    facet_wrap("call") + 
    theme_stata() +
    theme(title = element_text(size = 25),
          axis.title.x = element_text(size=20),
          axis.title.y = element_text(size=20),
          axis.text.x = element_text(size=10),
          axis.text.y = element_text(size=15)) +
    labs(title = "Total weight per trip by vessel",
         x = "\n Total Weight per trip",
         y = "Count\n")
  dev.off()
}



## ----const_fuel, echo = FALSE, eval=TRUE-------------------------------
hist_fuel_const = ggplot(dat3.trip.const, aes(x=fuel_use)) +
  geom_histogram(fill = "grey40",col = "grey20",binwidth = 5) + 
  theme_bw() +
  labs(title = "Fuel use per trip",
       x = "Fuel Use (KL/trip)",
       y = "Count\n") + 
  theme(plot.title = element_text(size=16),
        text = element_text(family = "Times New Roman"))

hist_fuel_const_by_ves = ggplot(dat3.trip.const, aes(x=fuel_use)) +
  geom_histogram(fill = "grey20",binwidth = 10) +
  facet_wrap("call") + 
  theme_bw() +
  labs(title = "Fuel use by vessel",
       x = "Fuel Use (KL/trip)",
       y = "Count")


# Fuel Constraint (Presentation)
if(FALSE){
  png("/Users/keita/Dropbox/UW/Research/Daily/Rcode/Const_fuel.png", width = 1200, height = 675)
  
  ggplot(dat3.trip.const, aes(x=fuel_use)) +
    geom_histogram(fill = "grey40",col = "grey20",binwidth = 5) + 
    theme_stata() +
    labs(title = "Fuel Use per trip",
         x = "\n Fuel Use (KL/trip)",
         y = "Count\n") +
    theme(title = element_text(size = 25),
          axis.title.x = element_text(size=20),
          axis.title.y = element_text(size=20),
          axis.text.x = element_text(size=20),
          axis.text.y = element_text(size=20))
  
  dev.off()
}

# fuel, by vesssel (Presentation)
if(FALSE){
  png("/Users/keita/Dropbox/UW/Research/Daily/Rcode/Const_fuel_ves.png", width = 1200, height = 675)
  ggplot(dat3.trip.const, aes(x=capa_wt)) +
    geom_histogram(fill = "grey40",col = "grey20", binwidth = 3000) + 
    facet_wrap("call") + 
    theme_stata() +
    theme(title = element_text(size = 25),
          axis.title.x = element_text(size=20),
          axis.title.y = element_text(size=20),
          axis.text.x = element_text(size=10),
          axis.text.y = element_text(size=15)) +
    labs(title = "Fuel Use per trip by vessel",
         x = "\n Fuel Use (KL/trip)",
         y = "Count\n")
  dev.off()
}



## ---- eval = TRUE, fig.cap="\\label{fig:histograms}Histograms of capacity and fuel use",fig.width=6,fig.height=3----
hist_wt_const_rel + hist_fuel_const + plot_annotation(tag_levels = 'A')


## ---- eval = FALSE-----------------------------------------------------
## # figure for publication, eps
## cairo_ps("figures/pub/histograms.eps", width = 6, height = 3)
##   hist_wt_const_rel + hist_fuel_const + plot_annotation(tag_levels = 'A')
## dev.off()


## ----daily_var, eval = TRUE--------------------------------------------

dat_day_catch =  dat3 %>% group_by(trip_id) %>% 
  summarise_at(vars(sword_num),
               list(~mean(.),~sd(.))) %>%
  ungroup() %>%
  summarise_at(vars(mean,sd),
               list(~mean(.)))

# plot: daily catch, all data
daily_catch = ggplot(dat3, aes(x = ope_num, y = sword_wt_interpo,group= ope_num)) + 
  geom_boxplot(outlier.size = 0.3) + 
  labs(x = "Number of operation", y = "Swordfish weight (kg)",
       title = " (A) Daily catch of swordfish operation") +
  theme_bw() + 
  theme(plot.title = element_text(size = 11),
        text = element_text(family = "Times New Roman"))

# plot: arbitral (first 3) trips to show the randomness of catch during a trip
daily_catch_var = ggplot(dat3 %>% filter(trip_id %in% c(1:3)), aes(x = ope_num, y = sword_wt_interpo,col = factor(trip_id), group = factor(trip_id))) + 
  geom_point(size = 0.9) + 
  geom_line() + 
  labs(x = "Number of operation", y = "Swordfish weight (kg)",
       title = "(B) Catch per operation, some trips",
       col = "Trip") + 
  theme_bw() +
  theme(legend.position = c(0.16, 0.84),
        legend.title = element_text(size = 9),
        legend.text=element_text(size=9),
        #legend.spacing = unit(2.0, 'mm'),
        legend.key.size = unit(2.0, 'mm'),
        plot.title = element_text(size = 11),
        text = element_text(family = "Times New Roman"))


# using days at sea (given REFEREE comment)
daily_catch_days_at_sea = ggplot(dat3, aes(x = days_past, y = sword_num,group= days_past)) + 
  geom_boxplot(outlier.size = 0.3) + 
  labs(x = "Days at Sea", y = "Swordfish (number)",
       title = "") +
  theme_bw() + 
  theme(plot.title = element_text(size = 11),
        text = element_text(family = "Times New Roman"))

# using days at sea (given REFEREE comment)
daily_cum_catch_days_at_sea = ggplot(dat3, aes(x = days_past, y = sword_num_cum, group= days_past)) + 
  geom_boxplot(outlier.size = 0.3) + 
  labs(x = "Days at Sea", y = "Swordfish (number)",
       title = "") +
  theme_bw() + 
  theme(plot.title = element_text(size = 11),
        text = element_text(family = "Times New Roman"))

# plot: arbitral (first 3) trips to show the randomness of catch during a trip
# using days at sea given REFEREE comments
daily_catch_var_days_at_sea = ggplot(dat3 %>% filter(trip_id %in% c(1:3)), aes(x = days_past, y = sword_wt_interpo,col = factor(trip_id), group = factor(trip_id))) + 
  geom_point(size = 0.9) + 
  geom_line() + 
  labs(x = "Days at Sea", y = "Swordfish weight (kg)",
       title = "(B) Catch per operation, some trips",
       col = "Trip") + 
  theme_bw() +
  theme(legend.position = c(0.16, 0.84),
        legend.title = element_text(size = 9),
        legend.text=element_text(size=9),
        #legend.spacing = unit(2.0, 'mm'),
        legend.key.size = unit(2.0, 'mm'),
        plot.title = element_text(size = 11),
        text = element_text(family = "Times New Roman"))




## ---- eval = FALSE, fig.cap="\\label{fig:dailyvar}Variations of daily swordfish catch. The panel A shows the whole data while the panel B shows some arbitrally chosen trips to highlight the variation within a trip.",fig.width = 6,fig.height=3----
## 
## # given the REFEREE comments, this is eval = FALSE. using the days-at-sea in the chunk below.
## 
## ## gridExtra::grid.arrange(daily_catch,daily_catch_var,nrow = 1)
## 
## daily_catch + daily_catch_var


## ---- eval = TRUE, fig.cap="\\label{fig:dailyvar2}Variations of daily swordfish catch. The panel (A) shows the daily catch and the panel (B) shows the daily cumulative number of catch.",fig.width = 6,fig.height=3----

# using patchwork
daily_catch_days_at_sea + daily_cum_catch_days_at_sea  + plot_annotation(tag_levels = 'A')



## ---- eval = FALSE-----------------------------------------------------
## # figure for publication, eps
## cairo_ps("figures/pub/dailyvar2.eps", width = 6, height = 3)


## ---- Export Data ---------------------

write.csv(dat3,"../../Data/daily_dat3.csv")



##   daily_catch_days_at_sea + daily_cum_catch_days_at_sea  + plot_annotation(tag_levels = 'A')
## dev.off()